## 'data.frame': 39644 obs. of 62 variables:
## $ url : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ timedelta : num 731 731 731 731 731 731 731 731 731 731 ...
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : num 0 0 0 0 0 0 1 0 0 0 ...
## $ data_channel_is_entertainment: num 1 0 0 1 0 0 0 0 0 0 ...
## $ data_channel_is_bus : num 0 1 1 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_tech : num 0 0 0 0 1 1 0 1 1 0 ...
## $ data_channel_is_world : num 0 0 0 0 0 0 0 0 0 1 ...
## $ kw_min_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num 496 0 918 0 3151 ...
## $ weekday_is_monday : num 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_tuesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
## $ Popular : Factor w/ 2 levels "N","Y": 1 1 2 1 1 1 1 1 2 1 ...
# Scan the predictor columns of super3_alt_ds (columns 3 .. ncol-1, skipping
# url/timedelta at the front and the response at the end) for negative values.
# Each offending column is reported as "index name count" and its index is
# collected in `clist` for later handling. Vectorized with vapply instead of
# growing `clist` one element at a time inside the loop.
col_idx <- 3:(ncol(super3_alt_ds) - 1)
# sum(x < 0, na.rm = TRUE) matches length(which(x < 0)): NAs are not counted.
neg_counts <- vapply(
  col_idx,
  function(j) sum(super3_alt_ds[, j] < 0, na.rm = TRUE),
  integer(1)
)
clist <- col_idx[neg_counts > 0]
neg_counts <- neg_counts[neg_counts > 0]  # align counts with clist
for (k in seq_along(clist)) {
  print(paste(clist[k], names(super3_alt_ds)[clist[k]], neg_counts[k]))
}
## [1] "20 kw_min_min 22980"
## [1] "22 kw_avg_min 833"
## [1] "26 kw_min_avg 6"
## [1] "46 global_sentiment_polarity 3264"
## [1] "54 avg_negative_polarity 37094"
## [1] "55 min_negative_polarity 37094"
## [1] "56 max_negative_polarity 37094"
## [1] "58 title_sentiment_polarity 5786"
# Return the rows of super3_alt_ds that belong to one data channel, with the
# non-predictor columns dropped.
#
# data_channel: name of a data_channel_is_* indicator column (string).
#
# Relies on globals defined elsewhere in this script: `super3_alt_ds` (the
# full dataset) and `non_perdictor_cols` (sic — column names to exclude).
# Fix: the original ended on an assignment, so the result was returned
# invisibly; return the subset directly instead.
datachannel_subsets_ds <- function(data_channel) {
  ds_index <- which(colnames(super3_alt_ds) == data_channel)
  dc_subset <- super3_alt_ds %>% filter(super3_alt_ds[, ds_index] == 1)
  # Drop identifier / non-predictor columns before modeling.
  dc_subset[, -which(colnames(dc_subset) %in% non_perdictor_cols)]
}
# Diagnostic plots for the logistic-regression linearity assumption: for each
# predictor retained in `model`, plot the predictor value against the logit of
# the fitted probability with a linear smooth overlaid, arranged in a grid.
#
# model:   a fitted binomial glm (probabilities come from the model's own
#          training data via predict(type = "response"))
# dataset: the data the model was fit on; only numeric columns are used
#
# Fix: seq_along(model_preds) replaces 1:length(model_preds), which would
# iterate over c(1, 0) if the model retained no predictors.
linear_assumptions <- function(model, dataset) {
  probabilities <- predict(model, type = "response")
  # Select only numeric predictors
  mydata <- dataset %>% dplyr::select_if(is.numeric)
  predictors <- colnames(mydata)
  # Bind the logit and reshape to long form (one row per predictor/value pair).
  mydata <- mydata %>%
    mutate(logit = log(probabilities / (1 - probabilities))) %>%
    gather(key = "predictors", value = "predictor.value", -logit)
  # Predictors kept by the model, dropping the intercept term.
  model_preds <- names(model$coefficients)[-1]
  myplots <- list()
  for (i in seq_along(model_preds)) {
    myplots[[i]] <- mydata %>%
      filter(predictors == model_preds[i]) %>%
      ggplot(aes(x = predictor.value, y = logit)) +
      geom_point(size = 0.5, alpha = 0.5) +
      xlab(model_preds[i]) +
      geom_smooth(method = "lm") +
      theme_bw()
  }
  # Arrange all plots in a roughly square grid.
  n <- length(myplots)
  nCol <- floor(sqrt(n))
  do.call("grid.arrange", c(myplots, ncol = nCol))
}
create SMOTE subset function to handle unbalanced response variable
subset_ds <- datachannel_subsets_ds('data_channel_is_world')
# Unbalanced Dataset
count(subset_ds, Popular)
# Balance the classes with SMOTE. Column 45 is dropped before resampling —
# presumably a non-predictor column such as the raw shares count; verify
# against the subset's column order.
smote_world_ds <- SMOTE(Popular ~ ., subset_ds[, -45], perc.over = 120, perc.under = 205)
# Balanced Dataset
count(smote_world_ds, Popular)
Build a model and make predictions using stepwise regression
# Candidate predictors for the world-channel model (objective 1).
objective1_predictors <- c('global_subjectivity', 'num_self_hrefs', 'rate_positive_words',
                           'average_token_length', 'num_hrefs', 'num_imgs', 'n_tokens_title', 'n_unique_tokens',
                           'LDA_04', 'rate_negative_words', 'kw_max_min', 'kw_min_max', 'kw_max_max',
                           'kw_avg_max', 'kw_max_avg', 'kw_avg_avg')
# Reproducible 80/20 train/test split stratified on the response.
set.seed(123)
training.samples <- smote_world_ds$Popular %>% createDataPartition(p = 0.8, list = FALSE)
train.data <- smote_world_ds[training.samples, ]
test.data <- smote_world_ds[-training.samples, ]
formula <- as.formula(paste('Popular', paste(objective1_predictors, collapse = " + "), sep = " ~ "))
# Fit the full logistic model, then prune it with backward stepwise AIC.
model <- glm(formula, data = train.data, family = binomial) %>% stepAIC(trace = FALSE, direction = "backward")
Model Summary
##
## Call:
## glm(formula = Popular ~ global_subjectivity + num_self_hrefs +
## rate_positive_words + average_token_length + num_hrefs +
## num_imgs + n_tokens_title + n_unique_tokens + LDA_04 + kw_min_max +
## kw_max_max + kw_avg_max + kw_max_avg + kw_avg_avg, family = binomial,
## data = train.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3380 -1.0920 -0.7211 1.1500 2.0820
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.165e-01 2.125e-01 -2.431 0.015076 *
## global_subjectivity 1.990e+00 2.801e-01 7.105 1.20e-12 ***
## num_self_hrefs -2.153e-02 9.072e-03 -2.374 0.017608 *
## rate_positive_words 6.612e-01 1.504e-01 4.395 1.11e-05 ***
## average_token_length -5.343e-01 4.480e-02 -11.926 < 2e-16 ***
## num_hrefs 2.644e-02 2.956e-03 8.944 < 2e-16 ***
## num_imgs 3.754e-02 5.196e-03 7.225 5.02e-13 ***
## n_tokens_title 3.045e-02 1.062e-02 2.868 0.004124 **
## n_unique_tokens 1.107e+00 2.940e-01 3.766 0.000166 ***
## LDA_04 6.245e-01 1.319e-01 4.735 2.19e-06 ***
## kw_min_max -4.725e-06 1.507e-06 -3.135 0.001721 **
## kw_max_max -5.284e-07 1.542e-07 -3.427 0.000610 ***
## kw_avg_max -1.805e-06 3.685e-07 -4.899 9.66e-07 ***
## kw_max_avg -9.030e-05 9.925e-06 -9.098 < 2e-16 ***
## kw_avg_avg 6.896e-04 5.434e-05 12.692 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13186 on 9512 degrees of freedom
## Residual deviance: 12529 on 9498 degrees of freedom
## AIC: 12559
##
## Number of Fisher Scoring iterations: 4
Predictions and Confusion matrix
probabilities <- model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "Y", "N")
# Model accuracy
mean(predicted.classes==test.data$Popular)## [1] 0.6083298
cm_table <-table(predicted.classes, test.data$Popular)[c(2,1),c(2,1)]
## Confunsion Matrix
CM <- confusionMatrix(cm_table)
CM## Confusion Matrix and Statistics
##
##
## predicted.classes Y N
## Y 649 406
## N 525 797
##
## Accuracy : 0.6083
## 95% CI : (0.5884, 0.628)
## No Information Rate : 0.5061
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.2156
##
## Mcnemar's Test P-Value : 0.00011
##
## Sensitivity : 0.5528
## Specificity : 0.6625
## Pos Pred Value : 0.6152
## Neg Pred Value : 0.6029
## Prevalence : 0.4939
## Detection Rate : 0.2730
## Detection Prevalence : 0.4438
## Balanced Accuracy : 0.6077
##
## 'Positive' Class : Y
##
Goodness of fit Test
# Goodness of fit test
# Hosmer-Lemeshow with g = 10 groups: compares observed outcomes (0/1) with
# the model's fitted training probabilities.
trainingPopular <- if_else(train.data$Popular == 'Y', 1, 0)
hoslem.test(trainingPopular, fitted(model), g = 10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: trainingPopular, fitted(model)
## X-squared = 51.186, df = 8, p-value = 2.416e-08
Relationship between predictor and Logit
Influential points Analysis
There are no influential points in this part of the dataset.
# Extract model results
model.data <- augment(model) %>%
  mutate(index = seq_len(n()))
# Inspect the three observations with the largest Cook's distances.
model.data %>% top_n(3, .cooksd)
# Plot standardized residuals by observation index, colored by class.
ggplot(model.data, aes(index, .std.resid)) +
  geom_point(aes(color = Popular), alpha = .5) +
  theme_bw()
create SMOTE subset function to handle unbalanced response variable
subset_ds <- datachannel_subsets_ds('data_channel_is_tech')
# Unbalanced Dataset
count(subset_ds, Popular)
# Balance the classes with SMOTE (same column-45 exclusion as the world
# section — verify it is still a non-predictor for this subset).
smote_tech_ds <- SMOTE(Popular ~ ., subset_ds[, -45], perc.over = 120, perc.under = 205)
# Balanced Dataset
# BUG FIX: the original counted smote_world_ds here (copy-paste from the
# world section); the balance check must run on the tech SMOTE set.
count(smote_tech_ds, Popular)
Build a model and make predictions using stepwise regression
# Candidate predictors for the tech-channel model (objective 1, part II).
objective1_predII <- c('global_subjectivity', 'num_self_hrefs', 'rate_positive_words',
                       'average_token_length', 'num_hrefs', 'n_tokens_title', 'n_unique_tokens',
                       'LDA_04', 'rate_negative_words', 'kw_max_min', 'kw_min_max',
                       'kw_max_avg', 'kw_avg_avg')
# Reproducible 80/20 train/test split stratified on the response.
set.seed(123)
training.samples <- smote_tech_ds$Popular %>% createDataPartition(p = 0.8, list = FALSE)
train.data <- smote_tech_ds[training.samples, ]
test.data <- smote_tech_ds[-training.samples, ]
formula <- as.formula(paste('Popular', paste(objective1_predII, collapse = " + "), sep = " ~ "))
# Fit the full logistic model, then prune it with backward stepwise AIC.
model <- glm(formula, data = train.data, family = binomial) %>% stepAIC(trace = FALSE, direction = "backward")
Model Summary
##
## Call:
## glm(formula = Popular ~ global_subjectivity + num_self_hrefs +
## rate_positive_words + num_hrefs + n_tokens_title + n_unique_tokens +
## rate_negative_words + kw_max_avg + kw_avg_avg, family = binomial,
## data = train.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3728 -1.1306 0.5413 1.1427 1.9985
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.016e-02 5.996e-01 0.050 0.9599
## global_subjectivity 1.262e+00 3.107e-01 4.060 4.91e-05 ***
## num_self_hrefs -3.649e-02 5.613e-03 -6.502 7.95e-11 ***
## rate_positive_words -1.379e+00 6.190e-01 -2.228 0.0259 *
## num_hrefs 3.855e-02 4.249e-03 9.074 < 2e-16 ***
## n_tokens_title -2.582e-02 1.036e-02 -2.493 0.0127 *
## n_unique_tokens -1.485e+00 2.317e-01 -6.408 1.47e-10 ***
## rate_negative_words -8.865e-01 6.241e-01 -1.420 0.1555
## kw_max_avg -5.684e-05 1.165e-05 -4.878 1.07e-06 ***
## kw_avg_avg 6.637e-04 4.823e-05 13.762 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13416 on 9678 degrees of freedom
## Residual deviance: 12854 on 9669 degrees of freedom
## AIC: 12874
##
## Number of Fisher Scoring iterations: 4
Predictions and Confusion matrix
probabilities <- model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "Y", "N")
# Model accuracy
mean(predicted.classes==test.data$Popular)## [1] 0.5905707
cm_table <-table(predicted.classes, test.data$Popular)[c(2,1),c(2,1)]
## Confunsion Matrix
CM <- confusionMatrix(cm_table)
CM## Confusion Matrix and Statistics
##
##
## predicted.classes Y N
## Y 719 485
## N 505 709
##
## Accuracy : 0.5906
## 95% CI : (0.5707, 0.6103)
## No Information Rate : 0.5062
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.1812
##
## Mcnemar's Test P-Value : 0.5459
##
## Sensitivity : 0.5874
## Specificity : 0.5938
## Pos Pred Value : 0.5972
## Neg Pred Value : 0.5840
## Prevalence : 0.5062
## Detection Rate : 0.2974
## Detection Prevalence : 0.4979
## Balanced Accuracy : 0.5906
##
## 'Positive' Class : Y
##
Goodness of fit Test
# Goodness of fit test
# Hosmer-Lemeshow with g = 10 groups: compares observed outcomes (0/1) with
# the model's fitted training probabilities.
trainingPopular <- if_else(train.data$Popular == 'Y', 1, 0)
hoslem.test(trainingPopular, fitted(model), g = 10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: trainingPopular, fitted(model)
## X-squared = 35.088, df = 8, p-value = 2.577e-05
Relationship between predictor and Logit
Influential points Analysis
There are no influential points in this part of the dataset.
# Extract model results
model.data <- augment(model) %>% mutate(index = seq_len(n()))
# Flag observations whose standardized residual exceeds +/- 3.
model.data %>%
  filter(abs(.std.resid) > 3)
# Plot standardized residuals by observation index, colored by class.
ggplot(model.data, aes(index, .std.resid)) +
  geom_point(aes(color = Popular), alpha = .5) +
  theme_bw()